import visual_behavior_glm.src.GLM_params as glm_params
import visual_behavior_glm.src.GLM_analysis_tools as gat
from visual_behavior_glm.src.glm import GLM
import matplotlib.pyplot as plt
import visual_behavior.data_access.loading as loading
import visual_behavior.database as db
import plotly.express as px
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
%matplotlib inline
results_all = gat.retrieve_results(results_type='full')
results_all['glm_version'].unique()
#use v4
rs = gat.retrieve_results(search_dict = {'glm_version': '4_L2_optimize_by_cell'}, results_type='summary')
len(rs)
rs['identifier'] = rs['ophys_experiment_id'].astype(str) + '_' + rs['cell_specimen_id'].astype(str)
rs
model_output_type = 'fraction_change_from_full'
rsp = rs.pivot(index='identifier',columns='dropout',values=model_output_type).reset_index()
rsp
tmp = 'variance_explained'
ve = rs.pivot(index='identifier',columns='dropout',values=tmp).reset_index()
ve
tmp = ve.rename(columns={'Full':'varience_explained_full_model'})
rsp = rsp.merge(tmp[['identifier','varience_explained_full_model']], on=['identifier'])
# cells_to_include = ve[ve['Full']>0.01].identifier.values
# order = np.argsort(ve[ve.identifier.isin(cells_to_include)==True]['Full'])
# cell_order = cells_to_include[order]
# len(cells_to_include)
# rsp = rsp[rsp.identifier.isin(cells_to_include)==True]
# len(rsp)
rspm = rsp.merge(rs[['identifier','cre_line','session_type','equipment_name']].drop_duplicates(),left_on='identifier',right_on='identifier',how='inner')
rspm
def map_session_types(session_type):
session_id = session_type[6:7]
return session_id
rspm['session_id'] = rspm['session_type'].map(lambda st:map_session_types(st))
rspm['session_id'].unique()
# save = True
# if save:
# rspm.to_csv('/allen/programs/braintv/workgroups/nc-ophys/visual_behavior/ophys_glm/fraction_change_var_explained_v_4_L2_fixed_lambda=1_filtered_2020.08.11.csv', index=False)
cols_for_clustering = [col for col in rspm.columns if col not in ['identifier','cre_line','session_type','equipment_name', 'session_id']]
cols_for_clustering = [col for col in cols_for_clustering if col not in ['image0','image1','image2','image3','image4','image5','image6','image7',
'visual']]
cols_for_clustering
cols_for_clustering = [
'omissions',
'all-images',
'image_expectation',
'change',
'hits',
'misses',
'correct_rejects',
'false_alarms',
'post_lick_bouts',
'post_licks',
'pre_lick_bouts',
'pre_licks',
'rewards',
'pupil',
'running',
'time',
'model_bias',
'model_omissions1',
'model_task0',
'model_timing1D',
]
feature_matrix = rspm[cols_for_clustering]
fig, ax = plt.subplots(figsize=(6,10))
ax = sns.heatmap(feature_matrix, vmin=-0.5, vmax=0.5, cmap='RdBu_r', ax=ax, cbar_kws={'label':model_output_type})
ax.set_ylabel('cells')
ax.set_title('GLM feature matrix')
# sorted by overall variance explaned
feature_matrix = rspm.sort_values('varience_explained_full_model').reset_index()[cols_for_clustering]
fig, ax = plt.subplots(figsize=(6,10))
ax = sns.heatmap(feature_matrix, vmin=-0.5, vmax=0.5, cmap='RdBu_r', ax=ax, cbar_kws={'label':model_output_type})
ax.set_ylabel('cells')
ax.set_title('GLM feature matrix')
# sorted by omission dropout
feature_matrix = rspm.sort_values('omissions').reset_index()[cols_for_clustering]
fig, ax = plt.subplots(figsize=(6,10))
ax = sns.heatmap(feature_matrix, vmin=-0.5, vmax=0.5, cmap='RdBu_r', ax=ax, cbar_kws={'label':model_output_type})
ax.set_ylabel('cells')
ax.set_title('GLM feature matrix')
# create kmeans object
kmeans = KMeans(n_clusters=4)
# fit kmeans object to data
kmeans.fit(rspm[cols_for_clustering])
rspm['k-means_cluster_id'] = kmeans.fit_predict(rspm[cols_for_clustering])
rspm['k-means_cluster_id'].value_counts()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(rspm[cols_for_clustering])
rspm['tsne-2d-one'] = tsne_results[:,0]
rspm['tsne-2d-two'] = tsne_results[:,1]
rspm['session_type'].unique()
fig,ax=plt.subplots()
ax.hist(rspm['omissions'],bins=np.arange(-1,1,0.01))
ax.axvline(0,color='black')
ax.set_xlabel('omissions dropout fractional change in var explained')
def is_modulated(v, threshold=0):
return v < threshold
rspm['omission_modulated'] = rspm['omissions'].map(lambda v:is_modulated(v, threshold=0))
rspm['running_modulated'] = rspm['running'].map(lambda v:is_modulated(v, threshold=0))
It looks like they cluster almost perfectly by omission modulation!
fig,ax=plt.subplots(2,2,figsize=(15,12),sharey=True,sharex=True)
sns.scatterplot(
x="tsne-2d-one",
y="tsne-2d-two",
hue="cre_line",
palette=sns.color_palette("hls", 3),
data=rspm,
legend="full",
alpha=0.3,
ax=ax[0,0]
)
sns.scatterplot(
x="tsne-2d-one",
y="tsne-2d-two",
hue="session_type",
palette=sns.color_palette("hls", 8),
data=rspm,
legend="full",
alpha=0.3,
ax=ax[0,1]
)
sns.scatterplot(
x="tsne-2d-one",
y="tsne-2d-two",
hue="k-means_cluster_id",
palette=sns.color_palette("hls", 4),
data=rspm,
legend="full",
alpha=0.3,
ax=ax[1,0]
)
sns.scatterplot(
x="tsne-2d-one",
y="tsne-2d-two",
hue='omission_modulated',
palette=sns.color_palette("hls", 2),
data=rspm,
legend="full",
alpha=0.3,
ax=ax[1,1]
)
fig.tight_layout()
for ii,hue in enumerate(['cre_line','session_type','k-means label', 'omission modulated']):
ax.flatten()[ii].set_title('color by {}'.format(hue))
fig.tight_layout()
fig,ax=plt.subplots(2,2,figsize=(20,12),sharey=True,sharex=True)
sns.scatterplot(
x="tsne-2d-one",
y="tsne-2d-two",
hue="session_id",
palette=sns.color_palette("hls", 4),
data=rspm, #.query('cre_line=="Vip-IRES-Cre"'),
legend='full',
alpha=0.3,
ax=ax[0,0]
)
ax[0,0].set_title('all cre lines')
sns.scatterplot(
x="tsne-2d-one",
y="tsne-2d-two",
hue="session_id",
palette=sns.color_palette("hls", 4),
data=rspm.query('cre_line=="Vip-IRES-Cre"'),
legend='full',
alpha=0.3,
ax=ax[0,1]
)
ax[0,1].set_title('VIP only')
sns.scatterplot(
x="tsne-2d-one",
y="tsne-2d-two",
hue="session_id",
palette=sns.color_palette("hls", 4),
data=rspm.query('cre_line=="Slc17a7-IRES2-Cre"'),
legend='full',
alpha=0.3,
ax=ax[1,0]
)
ax[1,0].set_title('Slc17a7 only')
sns.scatterplot(
x="tsne-2d-one",
y="tsne-2d-two",
hue="session_id",
palette=sns.color_palette("hls", 4),
data=rspm.query('cre_line=="Sst-IRES-Cre"'),
legend='full',
alpha=0.3,
ax=ax[1,1]
)
ax[1,1].set_title('Sst only')
fig,ax=plt.subplots(1,1,figsize=(10,6),sharey=True,sharex=True)
sns.scatterplot(
x="tsne-2d-one",
y="tsne-2d-two",
hue='omissions',
palette='magma',
data=rspm.query('omissions>-1 and omissions<1'),
legend=False,
alpha=0.3,
ax=ax
)
It kinda works
kmeans = KMeans(n_clusters=2)
kmeans.fit(rspm[cols_for_clustering])
tsne_cols = ['tsne-2d-one','tsne-2d-two']
rspm['k-means_on_tSNE'] = kmeans.fit_predict(rspm[tsne_cols])
rspm['k-means_on_tSNE'].value_counts()
fig,ax=plt.subplots(1,1,figsize=(10,6),sharey=True,sharex=True)
sns.scatterplot(
x="tsne-2d-one",
y="tsne-2d-two",
hue='k-means_on_tSNE',
palette=sns.color_palette("hls", 2),
data=rspm,
legend='full',
alpha=0.3,
ax=ax,
)
It's also assigning to a single cluster
# # import hierarchical clustering libraries
# import scipy.cluster.hierarchy as sch
# from sklearn.cluster import AgglomerativeClustering
# # create clusters
# hc = AgglomerativeClustering(n_clusters=4, affinity = 'euclidean', linkage = 'ward')
# # save clusters for chart
# y_hc = hc.fit_predict(rspm[cols_for_clustering])
# rspm['hc'] = y_hc
# rspm['hc'].value_counts()
import umap
min_dist = 0
spread=5
n_neighbors = 10
metric = 'euclidean'
reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors, min_dist=min_dist, metric=metric, spread=spread)
mapper = reducer.fit(rspm[cols_for_clustering])
embedding = reducer.fit_transform(rspm[cols_for_clustering])
embedding.shape
mapper.embedding_.shape
# save_dir = r'\\allen\programs\braintv\workgroups\nc-ophys\visual_behavior\clustering\UMAP_output'
# filepath = os.path.join(save_dir, '200810_UMAP_10D_GLM_fraction_change_in_explained_variance')
# np.save(file=filepath, arr=embedding_3d)
# filepath = os.path.join(save_dir, '200810_UMAP_3D_GLM_fraction_change_in_explained_variance')
# np.save(file=filepath, arr=embedding_3d)
# save_dir = r'\\allen\programs\braintv\workgroups\nc-ophys\visual_behavior\clustering\UMAP_output'
# filepath = os.path.join(save_dir, '200810_UMAP_2D_GLM_fraction_change_in_explained_variance.npy')
# data = np.load(filepath)
rspm['umap_embedding_0'] = embedding[:, 0]
rspm['umap_embedding_1'] = embedding[:, 1]
# rspm['umap_embedding_2'] = embedding[:, 2]
umap_cols = ['umap_embedding_0', 'umap_embedding_1']
only two clusters since the 2d umap gives two clear clusters k-means seems to identify them perfectly!
kmeans = KMeans(n_clusters=3)
umap_cols = ['umap_embedding_0','umap_embedding_1']
rspm['k-means_on_umap'] = kmeans.fit_predict(rspm[umap_cols])
rspm['k-means_on_umap'].value_counts()
k-means clearly didn't do a good job of identifying two of the three clusters
fig,ax=plt.subplots(1,1,figsize=(10,6),sharey=True,sharex=True)
sns.scatterplot(
x='umap_embedding_0',
y='umap_embedding_1',
hue='k-means_on_umap',
palette=sns.color_palette("hls", 3),
data=rspm,
legend='full',
alpha=0.3,
ax=ax,
)
import seaborn as sns
fig,ax=plt.subplots(2,2,figsize=(15,12),sharey=True,sharex=True)
sns.scatterplot(
x='umap_embedding_0',
y='umap_embedding_1',
hue="cre_line",
palette=sns.color_palette("hls", 3),
data=rspm,
legend="full",
alpha=0.1,
ax=ax[0,0]
)
sns.scatterplot(
x='umap_embedding_0',
y='umap_embedding_1',
hue="session_id",
palette=sns.color_palette("hls", 4),
data=rspm,
legend="full",
alpha=0.1,
ax=ax[0,1]
)
sns.scatterplot(
x='umap_embedding_0',
y='umap_embedding_1',
hue='k-means_on_umap',
palette=sns.color_palette("hls", len(rspm['k-means_on_umap'].unique())),
data=rspm,
legend="full",
alpha=0.1,
ax=ax[1,0]
)
sns.scatterplot(
x='umap_embedding_0',
y='umap_embedding_1',
hue='omission_modulated',
palette=sns.color_palette("hls", 2),
data=rspm,
legend="full",
alpha=0.1,
ax=ax[1,1]
)
fig.tight_layout()
for ii,hue in enumerate(['cre_line','session_type','k-means label', 'omission modulated']):
ax.flatten()[ii].set_title('color by {}'.format(hue))
fig.tight_layout()
# import plotly.express as px
# fig = px.scatter_3d(
# rspm,
# x='umap_embedding_0',
# y='umap_embedding_1',
# z='umap_embedding_2',
# color='k-means_on_umap'
# )
# fig.show()
fig,ax=plt.subplots(1,1,figsize=(10,6),sharey=True,sharex=True)
ax = sns.scatterplot(
x='umap_embedding_0',
y='umap_embedding_1',
hue='omissions',
palette='magma',
data=rspm.query('omissions>-1 and omissions<0.25'),
legend=False,
alpha=0.3,
ax=ax,
)
# plt.colorbar(cax=ax, title='omission_modulation')